1 Exploratory Data Analysis:

Review the structure and content of the data and answer questions such as: Are the features (columns) of your data correlated? What is the overall distribution of each variable? Are there any outliers present? What are the relationships between different variables? How are categorical variables distributed? Do any patterns or trends emerge in the data? What is the central tendency and spread of each variable? Are there any missing values and how significant are they?

# Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(naniar)     # for missing value visualization
library(DataExplorer) # optional: automated EDA
# Load Dataset

url <- "https://raw.githubusercontent.com/uzmabb182/Data_622/refs/heads/main/Assignment_1_EDA/bank-additional-full.csv"
bank_additional_df <- read.csv2(url, stringsAsFactors = FALSE)
head(bank_additional_df)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no
# Basic structure
str(bank_additional_df)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr  "housemaid" "services" "services" "admin." ...
##  $ marital       : chr  "married" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr  "no" "unknown" "no" "no" ...
##  $ housing       : chr  "no" "no" "yes" "no" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : chr  "1.1" "1.1" "1.1" "1.1" ...
##  $ cons.price.idx: chr  "93.994" "93.994" "93.994" "93.994" ...
##  $ cons.conf.idx : chr  "-36.4" "-36.4" "-36.4" "-36.4" ...
##  $ euribor3m     : chr  "4.857" "4.857" "4.857" "4.857" ...
##  $ nr.employed   : chr  "5191" "5191" "5191" "5191" ...
##  $ y             : chr  "no" "no" "no" "no" ...
# Dimensions
dim(bank_additional_df)   # rows, columns
## [1] 41188    21
nrow(bank_additional_df)  # number of rows
## [1] 41188
ncol(bank_additional_df)  # number of columns
## [1] 21
# Column names
names(bank_additional_df)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"
# Summary statistics for all variables
summary(bank_additional_df)
##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome         emp.var.rate      
##  Min.   :  0.0   Min.   :0.000   Length:41188       Length:41188      
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   Class :character  
##  Median :999.0   Median :0.000   Mode  :character   Mode  :character  
##  Mean   :962.5   Mean   :0.173                                        
##  3rd Qu.:999.0   3rd Qu.:0.000                                        
##  Max.   :999.0   Max.   :7.000                                        
##  cons.price.idx     cons.conf.idx       euribor3m         nr.employed       
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       y            
##  Length:41188      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# First and last few records
head(bank_additional_df, 10)
##    age         job marital           education default housing loan   contact
## 1   56   housemaid married            basic.4y      no      no   no telephone
## 2   57    services married         high.school unknown      no   no telephone
## 3   37    services married         high.school      no     yes   no telephone
## 4   40      admin. married            basic.6y      no      no   no telephone
## 5   56    services married         high.school      no      no  yes telephone
## 6   45    services married            basic.9y unknown      no   no telephone
## 7   59      admin. married professional.course      no      no   no telephone
## 8   41 blue-collar married             unknown unknown      no   no telephone
## 9   24  technician  single professional.course      no     yes   no telephone
## 10  25    services  single         high.school      no     yes   no telephone
##    month day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1    may         mon      261        1   999        0 nonexistent          1.1
## 2    may         mon      149        1   999        0 nonexistent          1.1
## 3    may         mon      226        1   999        0 nonexistent          1.1
## 4    may         mon      151        1   999        0 nonexistent          1.1
## 5    may         mon      307        1   999        0 nonexistent          1.1
## 6    may         mon      198        1   999        0 nonexistent          1.1
## 7    may         mon      139        1   999        0 nonexistent          1.1
## 8    may         mon      217        1   999        0 nonexistent          1.1
## 9    may         mon      380        1   999        0 nonexistent          1.1
## 10   may         mon       50        1   999        0 nonexistent          1.1
##    cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1          93.994         -36.4     4.857        5191 no
## 2          93.994         -36.4     4.857        5191 no
## 3          93.994         -36.4     4.857        5191 no
## 4          93.994         -36.4     4.857        5191 no
## 5          93.994         -36.4     4.857        5191 no
## 6          93.994         -36.4     4.857        5191 no
## 7          93.994         -36.4     4.857        5191 no
## 8          93.994         -36.4     4.857        5191 no
## 9          93.994         -36.4     4.857        5191 no
## 10         93.994         -36.4     4.857        5191 no
tail(bank_additional_df, 10)
##       age         job  marital           education default housing loan
## 41179  62     retired  married   university.degree      no      no   no
## 41180  64     retired divorced professional.course      no     yes   no
## 41181  36      admin.  married   university.degree      no      no   no
## 41182  37      admin.  married   university.degree      no     yes   no
## 41183  29  unemployed   single            basic.4y      no     yes   no
## 41184  73     retired  married professional.course      no     yes   no
## 41185  46 blue-collar  married professional.course      no      no   no
## 41186  56     retired  married   university.degree      no     yes   no
## 41187  44  technician  married professional.course      no      no   no
## 41188  74     retired  married professional.course      no     yes   no
##        contact month day_of_week duration campaign pdays previous    poutcome
## 41179 cellular   nov         thu      483        2     6        3     success
## 41180 cellular   nov         fri      151        3   999        0 nonexistent
## 41181 cellular   nov         fri      254        2   999        0 nonexistent
## 41182 cellular   nov         fri      281        1   999        0 nonexistent
## 41183 cellular   nov         fri      112        1     9        1     success
## 41184 cellular   nov         fri      334        1   999        0 nonexistent
## 41185 cellular   nov         fri      383        1   999        0 nonexistent
## 41186 cellular   nov         fri      189        2   999        0 nonexistent
## 41187 cellular   nov         fri      442        1   999        0 nonexistent
## 41188 cellular   nov         fri      239        3   999        1     failure
##       emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed   y
## 41179         -1.1         94.767         -50.8     1.031      4963.6 yes
## 41180         -1.1         94.767         -50.8     1.028      4963.6  no
## 41181         -1.1         94.767         -50.8     1.028      4963.6  no
## 41182         -1.1         94.767         -50.8     1.028      4963.6 yes
## 41183         -1.1         94.767         -50.8     1.028      4963.6  no
## 41184         -1.1         94.767         -50.8     1.028      4963.6 yes
## 41185         -1.1         94.767         -50.8     1.028      4963.6  no
## 41186         -1.1         94.767         -50.8     1.028      4963.6  no
## 41187         -1.1         94.767         -50.8     1.028      4963.6 yes
## 41188         -1.1         94.767         -50.8     1.028      4963.6  no
# Missing values per column
missing_summary <- bank_additional_df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing_Count") %>%
  mutate(Missing_Percent = round(Missing_Count / nrow(bank_additional_df) * 100, 2)) %>%
  arrange(desc(Missing_Count))

missing_summary
## # A tibble: 21 × 3
##    Variable    Missing_Count Missing_Percent
##    <chr>               <int>           <dbl>
##  1 age                     0               0
##  2 job                     0               0
##  3 marital                 0               0
##  4 education               0               0
##  5 default                 0               0
##  6 housing                 0               0
##  7 loan                    0               0
##  8 contact                 0               0
##  9 month                   0               0
## 10 day_of_week             0               0
## # ℹ 11 more rows
# Unique values in categorical variables (factor/character columns)
lapply(bank_additional_df[sapply(bank_additional_df, is.character)], unique)
## $job
##  [1] "housemaid"     "services"      "admin."        "blue-collar"  
##  [5] "technician"    "retired"       "management"    "unemployed"   
##  [9] "self-employed" "unknown"       "entrepreneur"  "student"      
## 
## $marital
## [1] "married"  "single"   "divorced" "unknown" 
## 
## $education
## [1] "basic.4y"            "high.school"         "basic.6y"           
## [4] "basic.9y"            "professional.course" "unknown"            
## [7] "university.degree"   "illiterate"         
## 
## $default
## [1] "no"      "unknown" "yes"    
## 
## $housing
## [1] "no"      "yes"     "unknown"
## 
## $loan
## [1] "no"      "yes"     "unknown"
## 
## $contact
## [1] "telephone" "cellular" 
## 
## $month
##  [1] "may" "jun" "jul" "aug" "oct" "nov" "dec" "mar" "apr" "sep"
## 
## $day_of_week
## [1] "mon" "tue" "wed" "thu" "fri"
## 
## $poutcome
## [1] "nonexistent" "failure"     "success"    
## 
## $emp.var.rate
##  [1] "1.1"  "1.4"  "-0.1" "-0.2" "-1.8" "-2.9" "-3.4" "-3"   "-1.7" "-1.1"
## 
## $cons.price.idx
##  [1] "93.994" "94.465" "93.918" "93.444" "93.798" "93.2"   "92.756" "92.843"
##  [9] "93.075" "92.893" "92.963" "92.469" "92.201" "92.379" "92.431" "92.649"
## [17] "92.713" "93.369" "93.749" "93.876" "94.055" "94.215" "94.027" "94.199"
## [25] "94.601" "94.767"
## 
## $cons.conf.idx
##  [1] "-36.4" "-41.8" "-42.7" "-36.1" "-40.4" "-42"   "-45.9" "-50"   "-47.1"
## [10] "-46.2" "-40.8" "-33.6" "-31.4" "-29.8" "-26.9" "-30.1" "-33"   "-34.8"
## [19] "-34.6" "-40"   "-39.8" "-40.3" "-38.3" "-37.5" "-49.5" "-50.8"
## 
## $euribor3m
##   [1] "4.857" "4.856" "4.855" "4.859" "4.86"  "4.858" "4.864" "4.865" "4.866"
##  [10] "4.967" "4.961" "4.959" "4.958" "4.96"  "4.962" "4.955" "4.947" "4.956"
##  [19] "4.966" "4.963" "4.957" "4.968" "4.97"  "4.965" "4.964" "5.045" "5"    
##  [28] "4.936" "4.921" "4.918" "4.912" "4.827" "4.794" "4.76"  "4.733" "4.7"  
##  [37] "4.663" "4.592" "4.474" "4.406" "4.343" "4.286" "4.245" "4.223" "4.191"
##  [46] "4.153" "4.12"  "4.076" "4.021" "3.901" "3.879" "3.853" "3.816" "3.743"
##  [55] "3.669" "3.563" "3.488" "3.428" "3.329" "3.282" "3.053" "1.811" "1.799"
##  [64] "1.778" "1.757" "1.726" "1.703" "1.687" "1.663" "1.65"  "1.64"  "1.629"
##  [73] "1.614" "1.602" "1.584" "1.574" "1.56"  "1.556" "1.548" "1.538" "1.531"
##  [82] "1.52"  "1.51"  "1.498" "1.483" "1.479" "1.466" "1.453" "1.445" "1.435"
##  [91] "1.423" "1.415" "1.41"  "1.405" "1.406" "1.4"   "1.392" "1.384" "1.372"
## [100] "1.365" "1.354" "1.344" "1.334" "1.327" "1.313" "1.299" "1.291" "1.281"
## [109] "1.266" "1.25"  "1.244" "1.259" "1.264" "1.27"  "1.262" "1.26"  "1.268"
## [118] "1.286" "1.252" "1.235" "1.224" "1.215" "1.206" "1.099" "1.085" "1.072"
## [127] "1.059" "1.048" "1.044" "1.029" "1.018" "1.007" "0.996" "0.979" "0.969"
## [136] "0.944" "0.937" "0.933" "0.927" "0.921" "0.914" "0.908" "0.903" "0.899"
## [145] "0.884" "0.883" "0.881" "0.879" "0.873" "0.869" "0.861" "0.859" "0.854"
## [154] "0.851" "0.849" "0.843" "0.838" "0.834" "0.829" "0.825" "0.821" "0.819"
## [163] "0.813" "0.809" "0.803" "0.797" "0.788" "0.781" "0.778" "0.773" "0.771"
## [172] "0.77"  "0.768" "0.766" "0.762" "0.755" "0.749" "0.743" "0.741" "0.739"
## [181] "0.75"  "0.753" "0.754" "0.752" "0.744" "0.74"  "0.742" "0.737" "0.735"
## [190] "0.733" "0.73"  "0.731" "0.728" "0.724" "0.722" "0.72"  "0.719" "0.716"
## [199] "0.715" "0.714" "0.718" "0.721" "0.717" "0.712" "0.71"  "0.709" "0.708"
## [208] "0.706" "0.707" "0.7"   "0.655" "0.654" "0.653" "0.652" "0.651" "0.65" 
## [217] "0.649" "0.646" "0.644" "0.643" "0.639" "0.637" "0.635" "0.636" "0.634"
## [226] "0.638" "0.64"  "0.642" "0.645" "0.659" "0.663" "0.668" "0.672" "0.677"
## [235] "0.682" "0.683" "0.684" "0.685" "0.688" "0.69"  "0.692" "0.695" "0.697"
## [244] "0.699" "0.701" "0.702" "0.704" "0.711" "0.713" "0.723" "0.727" "0.729"
## [253] "0.732" "0.748" "0.761" "0.767" "0.782" "0.79"  "0.793" "0.802" "0.81" 
## [262] "0.822" "0.827" "0.835" "0.84"  "0.846" "0.87"  "0.876" "0.885" "0.889"
## [271] "0.893" "0.896" "0.898" "0.9"   "0.904" "0.905" "0.895" "0.894" "0.891"
## [280] "0.89"  "0.888" "0.886" "0.882" "0.88"  "0.878" "0.877" "0.942" "0.953"
## [289] "0.956" "0.959" "0.965" "0.972" "0.977" "0.982" "0.985" "0.987" "0.993"
## [298] "1"     "1.008" "1.016" "1.025" "1.032" "1.037" "1.043" "1.045" "1.047"
## [307] "1.05"  "1.049" "1.046" "1.041" "1.04"  "1.039" "1.035" "1.03"  "1.031"
## [316] "1.028"
## 
## $nr.employed
##  [1] "5191"   "5228.1" "5195.8" "5176.3" "5099.1" "5076.2" "5017.5" "5023.5"
##  [9] "5008.7" "4991.6" "4963.6"
## 
## $y
## [1] "no"  "yes"
library(dplyr)
library(tidyr)

# Select only character (categorical) columns
categorical_df <- bank_additional_df %>% select(where(is.character))

# Create readable frequency tables
freq_tables <- lapply(names(categorical_df), function(col) {
  categorical_df %>%
    count(!!sym(col)) %>%      # count occurrences of each unique value
    arrange(desc(n)) %>%       # sort descending
    rename(Value = !!sym(col),
           Frequency = n) %>%
    mutate(Variable = col) %>% # add column name
    select(Variable, Value, Frequency)
})

# Combine all into one data frame
freq_tables_df <- bind_rows(freq_tables)

# View
freq_tables_df
##           Variable               Value Frequency
## 1              job              admin.     10422
## 2              job         blue-collar      9254
## 3              job          technician      6743
## 4              job            services      3969
## 5              job          management      2924
## 6              job             retired      1720
## 7              job        entrepreneur      1456
## 8              job       self-employed      1421
## 9              job           housemaid      1060
## 10             job          unemployed      1014
## 11             job             student       875
## 12             job             unknown       330
## 13         marital             married     24928
## 14         marital              single     11568
## 15         marital            divorced      4612
## 16         marital             unknown        80
## 17       education   university.degree     12168
## 18       education         high.school      9515
## 19       education            basic.9y      6045
## 20       education professional.course      5243
## 21       education            basic.4y      4176
## 22       education            basic.6y      2292
## 23       education             unknown      1731
## 24       education          illiterate        18
## 25         default                  no     32588
## 26         default             unknown      8597
## 27         default                 yes         3
## 28         housing                 yes     21576
## 29         housing                  no     18622
## 30         housing             unknown       990
## 31            loan                  no     33950
## 32            loan                 yes      6248
## 33            loan             unknown       990
## 34         contact            cellular     26144
## 35         contact           telephone     15044
## 36           month                 may     13769
## 37           month                 jul      7174
## 38           month                 aug      6178
## 39           month                 jun      5318
## 40           month                 nov      4101
## 41           month                 apr      2632
## 42           month                 oct       718
## 43           month                 sep       570
## 44           month                 mar       546
## 45           month                 dec       182
## 46     day_of_week                 thu      8623
## 47     day_of_week                 mon      8514
## 48     day_of_week                 wed      8134
## 49     day_of_week                 tue      8090
## 50     day_of_week                 fri      7827
## 51        poutcome         nonexistent     35563
## 52        poutcome             failure      4252
## 53        poutcome             success      1373
## 54    emp.var.rate                 1.4     16234
## 55    emp.var.rate                -1.8      9184
## 56    emp.var.rate                 1.1      7763
## 57    emp.var.rate                -0.1      3683
## 58    emp.var.rate                -2.9      1663
## 59    emp.var.rate                -3.4      1071
## 60    emp.var.rate                -1.7       773
## 61    emp.var.rate                -1.1       635
## 62    emp.var.rate                  -3       172
## 63    emp.var.rate                -0.2        10
## 64  cons.price.idx              93.994      7763
## 65  cons.price.idx              93.918      6685
## 66  cons.price.idx              92.893      5794
## 67  cons.price.idx              93.444      5175
## 68  cons.price.idx              94.465      4374
## 69  cons.price.idx                93.2      3616
## 70  cons.price.idx              93.075      2458
## 71  cons.price.idx              92.201       770
## 72  cons.price.idx              92.963       715
## 73  cons.price.idx              92.431       447
## 74  cons.price.idx              92.649       357
## 75  cons.price.idx              94.215       311
## 76  cons.price.idx              94.199       303
## 77  cons.price.idx              92.843       282
## 78  cons.price.idx              92.379       267
## 79  cons.price.idx              93.369       264
## 80  cons.price.idx              94.027       233
## 81  cons.price.idx              94.055       229
## 82  cons.price.idx              93.876       212
## 83  cons.price.idx              94.601       204
## 84  cons.price.idx              92.469       178
## 85  cons.price.idx              93.749       174
## 86  cons.price.idx              92.713       172
## 87  cons.price.idx              94.767       128
## 88  cons.price.idx              93.798        67
## 89  cons.price.idx              92.756        10
## 90   cons.conf.idx               -36.4      7763
## 91   cons.conf.idx               -42.7      6685
## 92   cons.conf.idx               -46.2      5794
## 93   cons.conf.idx               -36.1      5175
## 94   cons.conf.idx               -41.8      4374
## 95   cons.conf.idx                 -42      3616
## 96   cons.conf.idx               -47.1      2458
## 97   cons.conf.idx               -31.4       770
## 98   cons.conf.idx               -40.8       715
## 99   cons.conf.idx               -26.9       447
## 100  cons.conf.idx               -30.1       357
## 101  cons.conf.idx               -40.3       311
## 102  cons.conf.idx               -37.5       303
## 103  cons.conf.idx                 -50       282
## 104  cons.conf.idx               -29.8       267
## 105  cons.conf.idx               -34.8       264
## 106  cons.conf.idx               -38.3       233
## 107  cons.conf.idx               -39.8       229
## 108  cons.conf.idx                 -40       212
## 109  cons.conf.idx               -49.5       204
## 110  cons.conf.idx               -33.6       178
## 111  cons.conf.idx               -34.6       174
## 112  cons.conf.idx                 -33       172
## 113  cons.conf.idx               -50.8       128
## 114  cons.conf.idx               -40.4        67
## 115  cons.conf.idx               -45.9        10
## 116      euribor3m               4.857      2868
## 117      euribor3m               4.962      2613
## 118      euribor3m               4.963      2487
## 119      euribor3m               4.961      1902
## 120      euribor3m               4.856      1210
## 121      euribor3m               4.964      1175
## 122      euribor3m               1.405      1169
## 123      euribor3m               4.965      1071
## 124      euribor3m               4.864      1044
## 125      euribor3m                4.96      1013
## 126      euribor3m               4.968       992
## 127      euribor3m               4.959       895
## 128      euribor3m                4.86       892
## 129      euribor3m               4.855       840
## 130      euribor3m               4.076       822
## 131      euribor3m               1.266       820
## 132      euribor3m               4.859       788
## 133      euribor3m                4.12       756
## 134      euribor3m               4.858       733
## 135      euribor3m               4.153       690
## 136      euribor3m               4.021       676
## 137      euribor3m               4.967       643
## 138      euribor3m               1.281       637
## 139      euribor3m               4.966       622
## 140      euribor3m               4.191       610
## 141      euribor3m                1.25       587
## 142      euribor3m               4.958       581
## 143      euribor3m               1.291       544
## 144      euribor3m               1.327       538
## 145      euribor3m               4.957       537
## 146      euribor3m               1.299       520
## 147      euribor3m               1.313       492
## 148      euribor3m               1.334       482
## 149      euribor3m               1.244       422
## 150      euribor3m               1.344       395
## 151      euribor3m               4.865       373
## 152      euribor3m               4.866       340
## 153      euribor3m               1.365       303
## 154      euribor3m                1.41       254
## 155      euribor3m                1.26       252
## 156      euribor3m               1.354       215
## 157      euribor3m               0.879       180
## 158      euribor3m                4.97       172
## 159      euribor3m               1.262       145
## 160      euribor3m               0.714       139
## 161      euribor3m               0.715       135
## 162      euribor3m               0.884       128
## 163      euribor3m               0.883       124
## 164      euribor3m                1.27       110
## 165      euribor3m               1.445       103
## 166      euribor3m               4.955       103
## 167      euribor3m               1.415        98
## 168      euribor3m               4.947        98
## 169      euribor3m               1.268        95
## 170      euribor3m               1.264        87
## 171      euribor3m               1.423        87
## 172      euribor3m               0.739        82
## 173      euribor3m               0.873        82
## 174      euribor3m               1.435        81
## 175      euribor3m               1.453        81
## 176      euribor3m               0.881        79
## 177      euribor3m                0.72        78
## 178      euribor3m               0.722        74
## 179      euribor3m               1.259        70
## 180      euribor3m               0.742        68
## 181      euribor3m               0.861        65
## 182      euribor3m               1.479        62
## 183      euribor3m               0.904        60
## 184      euribor3m               1.466        57
## 185      euribor3m               0.716        54
## 186      euribor3m               0.869        54
## 187      euribor3m               0.899        50
## 188      euribor3m               1.483        50
## 189      euribor3m               0.646        49
## 190      euribor3m               0.886        48
## 191      euribor3m                0.74        45
## 192      euribor3m               0.754        44
## 193      euribor3m               1.029        44
## 194      euribor3m               0.635        43
## 195      euribor3m               0.682        39
## 196      euribor3m               0.898        39
## 197      euribor3m               0.644        38
## 198      euribor3m               0.797        38
## 199      euribor3m               0.896        37
## 200      euribor3m               1.044        37
## 201      euribor3m               0.642        35
## 202      euribor3m               0.652        35
## 203      euribor3m               0.728        35
## 204      euribor3m               0.849        35
## 205      euribor3m               0.859        35
## 206      euribor3m               1.498        35
## 207      euribor3m               0.655        34
## 208      euribor3m               1.072        34
## 209      euribor3m               0.878        33
## 210      euribor3m               0.803        31
## 211      euribor3m               0.876        31
## 212      euribor3m               1.811        31
## 213      euribor3m               0.719        30
## 214      euribor3m               0.854        30
## 215      euribor3m               0.838        29
## 216      euribor3m               1.531        29
## 217      euribor3m               0.699        28
## 218      euribor3m               0.741        27
## 219      euribor3m               0.825        27
## 220      euribor3m               0.851        27
## 221      euribor3m                 0.9        27
## 222      euribor3m               0.645        26
## 223      euribor3m               0.707        26
## 224      euribor3m               1.252        26
## 225      euribor3m               0.737        25
## 226      euribor3m               0.882        25
## 227      euribor3m               1.406        25
## 228      euribor3m                0.73        24
## 229      euribor3m               0.821        24
## 230      euribor3m               0.827        24
## 231      euribor3m               0.643        23
## 232      euribor3m               0.697        23
## 233      euribor3m               0.724        23
## 234      euribor3m               1.059        23
## 235      euribor3m               4.956        23
## 236      euribor3m               0.702        22
## 237      euribor3m               0.761        22
## 238      euribor3m               0.773        22
## 239      euribor3m               0.819        22
## 240      euribor3m               1.048        22
## 241      euribor3m               1.687        22
## 242      euribor3m               0.735        21
## 243      euribor3m               0.781        21
## 244      euribor3m               0.809        21
## 245      euribor3m               0.846        21
## 246      euribor3m               0.977        21
## 247      euribor3m                1.05        21
## 248      euribor3m               1.392        21
## 249      euribor3m               0.654        20
## 250      euribor3m                0.77        20
## 251      euribor3m               0.788        20
## 252      euribor3m               0.835        20
## 253      euribor3m               0.877        20
## 254      euribor3m                0.88        20
## 255      euribor3m               1.215        20
## 256      euribor3m               1.663        20
## 257      euribor3m               1.757        20
## 258      euribor3m               0.653        19
## 259      euribor3m                0.81        19
## 260      euribor3m               0.987        19
## 261      euribor3m               0.668        18
## 262      euribor3m               0.706        18
## 263      euribor3m               0.717        18
## 264      euribor3m               0.718        18
## 265      euribor3m               0.733        18
## 266      euribor3m                0.84        18
## 267      euribor3m                   1        18
## 268      euribor3m               0.743        17
## 269      euribor3m               0.744        17
## 270      euribor3m               0.767        17
## 271      euribor3m               0.889        17
## 272      euribor3m               0.905        17
## 273      euribor3m               0.972        17
## 274      euribor3m                1.52        17
## 275      euribor3m               1.538        17
## 276      euribor3m               0.639        16
## 277      euribor3m               0.672        16
## 278      euribor3m               0.684        16
## 279      euribor3m               0.843        16
## 280      euribor3m               0.908        16
## 281      euribor3m               0.959        16
## 282      euribor3m               1.032        16
## 283      euribor3m               1.286        16
## 284      euribor3m               0.659        15
## 285      euribor3m               0.731        15
## 286      euribor3m               0.982        15
## 287      euribor3m               1.046        15
## 288      euribor3m               0.636        14
## 289      euribor3m               0.683        14
## 290      euribor3m               1.025        14
## 291      euribor3m               1.799        14
## 292      euribor3m               0.729        13
## 293      euribor3m               0.768        13
## 294      euribor3m               0.829        13
## 295      euribor3m               0.834        13
## 296      euribor3m                0.87        13
## 297      euribor3m               0.893        13
## 298      euribor3m               1.049        13
## 299      euribor3m                 1.4        13
## 300      euribor3m               1.614        13
## 301      euribor3m                0.65        12
## 302      euribor3m               0.677        12
## 303      euribor3m               0.748        12
## 304      euribor3m               0.903        12
## 305      euribor3m               1.556        12
## 306      euribor3m               0.663        11
## 307      euribor3m               0.701        11
## 308      euribor3m               0.782        11
## 309      euribor3m                0.79        11
## 310      euribor3m               0.822        11
## 311      euribor3m               1.099        11
## 312      euribor3m                1.51        11
## 313      euribor3m               1.726        11
## 314      euribor3m                0.64        10
## 315      euribor3m               0.649        10
## 316      euribor3m               0.692        10
## 317      euribor3m               0.695        10
## 318      euribor3m               0.712        10
## 319      euribor3m               0.723        10
## 320      euribor3m               0.885        10
## 321      euribor3m                1.04        10
## 322      euribor3m               1.372        10
## 323      euribor3m               1.629        10
## 324      euribor3m                1.64        10
## 325      euribor3m               0.685         9
## 326      euribor3m                0.71         9
## 327      euribor3m               0.713         9
## 328      euribor3m               0.793         9
## 329      euribor3m               1.016         9
## 330      euribor3m               1.028         9
## 331      euribor3m               1.039         9
## 332      euribor3m               1.041         9
## 333      euribor3m               1.043         9
## 334      euribor3m               1.206         9
## 335      euribor3m               1.235         9
## 336      euribor3m               1.384         9
## 337      euribor3m               4.245         9
## 338      euribor3m               4.663         9
## 339      euribor3m               5.045         9
## 340      euribor3m               0.634         8
## 341      euribor3m               0.709         8
## 342      euribor3m                0.89         8
## 343      euribor3m               1.031         8
## 344      euribor3m                1.56         8
## 345      euribor3m               1.602         8
## 346      euribor3m                1.65         8
## 347      euribor3m               1.703         8
## 348      euribor3m                 4.7         8
## 349      euribor3m               0.638         7
## 350      euribor3m               0.651         7
## 351      euribor3m               0.704         7
## 352      euribor3m                0.75         7
## 353      euribor3m               0.753         7
## 354      euribor3m               0.755         7
## 355      euribor3m               0.778         7
## 356      euribor3m               0.802         7
## 357      euribor3m               0.942         7
## 358      euribor3m               0.985         7
## 359      euribor3m               1.035         7
## 360      euribor3m               1.085         7
## 361      euribor3m               1.224         7
## 362      euribor3m               4.286         7
## 363      euribor3m               4.406         7
## 364      euribor3m               4.912         7
## 365      euribor3m                   5         7
## 366      euribor3m               0.637         6
## 367      euribor3m               0.708         6
## 368      euribor3m               0.721         6
## 369      euribor3m               0.732         6
## 370      euribor3m               0.771         6
## 371      euribor3m               0.813         6
## 372      euribor3m                1.03         6
## 373      euribor3m               1.037         6
## 374      euribor3m               1.548         6
## 375      euribor3m               4.936         6
## 376      euribor3m                 0.7         5
## 377      euribor3m               0.727         5
## 378      euribor3m               0.752         5
## 379      euribor3m               0.888         5
## 380      euribor3m               0.965         5
## 381      euribor3m               0.993         5
## 382      euribor3m               1.008         5
## 383      euribor3m               4.343         5
## 384      euribor3m               4.794         5
## 385      euribor3m               4.827         5
## 386      euribor3m               0.711         4
## 387      euribor3m               0.762         4
## 388      euribor3m               0.891         4
## 389      euribor3m               4.223         4
## 390      euribor3m               4.592         4
## 391      euribor3m               4.918         4
## 392      euribor3m               0.688         3
## 393      euribor3m                0.69         3
## 394      euribor3m               0.766         3
## 395      euribor3m               0.894         3
## 396      euribor3m               0.895         3
## 397      euribor3m               0.914         3
## 398      euribor3m               0.944         3
## 399      euribor3m               0.979         3
## 400      euribor3m               1.007         3
## 401      euribor3m               1.018         3
## 402      euribor3m               1.584         3
## 403      euribor3m               1.778         3
## 404      euribor3m               4.474         3
## 405      euribor3m                4.76         3
## 406      euribor3m               4.921         3
## 407      euribor3m               0.749         2
## 408      euribor3m               0.921         2
## 409      euribor3m               0.927         2
## 410      euribor3m               0.937         2
## 411      euribor3m               0.953         2
## 412      euribor3m               3.563         2
## 413      euribor3m               3.879         2
## 414      euribor3m               4.733         2
## 415      euribor3m               0.933         1
## 416      euribor3m               0.956         1
## 417      euribor3m               0.969         1
## 418      euribor3m               0.996         1
## 419      euribor3m               1.045         1
## 420      euribor3m               1.047         1
## 421      euribor3m               1.574         1
## 422      euribor3m               3.053         1
## 423      euribor3m               3.282         1
## 424      euribor3m               3.329         1
## 425      euribor3m               3.428         1
## 426      euribor3m               3.488         1
## 427      euribor3m               3.669         1
## 428      euribor3m               3.743         1
## 429      euribor3m               3.816         1
## 430      euribor3m               3.853         1
## 431      euribor3m               3.901         1
## 432    nr.employed              5228.1     16234
## 433    nr.employed              5099.1      8534
## 434    nr.employed                5191      7763
## 435    nr.employed              5195.8      3683
## 436    nr.employed              5076.2      1663
## 437    nr.employed              5017.5      1071
## 438    nr.employed              4991.6       773
## 439    nr.employed              5008.7       650
## 440    nr.employed              4963.6       635
## 441    nr.employed              5023.5       172
## 442    nr.employed              5176.3        10
## 443              y                  no     36548
## 444              y                 yes      4640
# Quick numeric summaries (mean, sd, min, max, quantiles)
sapply(bank_additional_df[sapply(bank_additional_df, is.numeric)], function(x) {
  c(mean = mean(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE),
    min = min(x, na.rm = TRUE),
    q25 = quantile(x, 0.25, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    q75 = quantile(x, 0.75, na.rm = TRUE),
    max = max(x, na.rm = TRUE))
})
##              age  duration  campaign    pdays  previous
## mean    40.02406  258.2850  2.567593 962.4755 0.1729630
## sd      10.42125  259.2792  2.770014 186.9109 0.4949011
## min     17.00000    0.0000  1.000000   0.0000 0.0000000
## q25.25% 32.00000  102.0000  1.000000 999.0000 0.0000000
## median  38.00000  180.0000  2.000000 999.0000 0.0000000
## q75.75% 47.00000  319.0000  3.000000 999.0000 0.0000000
## max     98.00000 4918.0000 56.000000 999.0000 7.0000000
# Central tendency & spread (numerical variables)

numeric_vars <- bank_additional_df %>% select(where(is.numeric))

summary(numeric_vars)
##       age           duration         campaign          pdays      
##  Min.   :17.00   Min.   :   0.0   Min.   : 1.000   Min.   :  0.0  
##  1st Qu.:32.00   1st Qu.: 102.0   1st Qu.: 1.000   1st Qu.:999.0  
##  Median :38.00   Median : 180.0   Median : 2.000   Median :999.0  
##  Mean   :40.02   Mean   : 258.3   Mean   : 2.568   Mean   :962.5  
##  3rd Qu.:47.00   3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.:999.0  
##  Max.   :98.00   Max.   :4918.0   Max.   :56.000   Max.   :999.0  
##     previous    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.173  
##  3rd Qu.:0.000  
##  Max.   :7.000
# Boxplots to check central tendency and outliers
numeric_vars_long <- numeric_vars %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(numeric_vars_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "skyblue") +
  theme_minimal() +
  ggtitle("Boxplots of Numeric Variables (Central Tendency & Outliers)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Overall distribution of each variable

# Histograms for numeric variables
ggplot(numeric_vars_long, aes(x = Value)) +
  geom_histogram(fill = "lightgreen", color = "black", bins = 30) +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  ggtitle("Histograms of Numeric Variables")

# Patterns or trends in data

# Example: average duration by job
bank_additional_df %>%
  group_by(job) %>%
  summarise(Average_Duration = mean(duration, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(job, Average_Duration), y = Average_Duration)) +
  geom_bar(stat = "identity", fill = "purple") +
  coord_flip() +
  theme_minimal() +
  ggtitle("Average Call Duration by Job")

# Correlation between numeric variables

corr_matrix <- cor(numeric_vars, use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black")

# Correlation matrix for numeric columns
numeric_vars <- bank_additional_df[sapply(bank_additional_df, is.numeric)]
cor(numeric_vars, use = "complete.obs")
##                   age     duration    campaign       pdays    previous
## age       1.000000000 -0.000865705  0.00459358 -0.03436895  0.02436474
## duration -0.000865705  1.000000000 -0.07169923 -0.04757702  0.02064035
## campaign  0.004593580 -0.071699226  1.00000000  0.05258357 -0.07914147
## pdays    -0.034368951 -0.047577015  0.05258357  1.00000000 -0.58751386
## previous  0.024364741  0.020640351 -0.07914147 -0.58751386  1.00000000

age & duration (-0.00087): Essentially zero correlation; age of client has no linear relationship with call duration. age & campaign (0.0046): Nearly zero; older clients are not contacted more or less often. age & pdays (-0.034): Very weak negative correlation; older clients slightly more likely to have been contacted recently in previous campaigns, but effect is negligible. age & previous (0.024): Essentially no correlation; age does not relate to prior contacts. duration & campaign (-0.072): Very weak negative correlation; longer calls slightly associated with fewer calls in this campaign. duration & pdays (-0.048): Very weak negative correlation; call duration not meaningfully related to days since last contact. duration & previous (0.021): Almost zero; prior contacts do not affect call length. campaign & pdays (0.053): Very weak positive correlation; number of contacts in this campaign is slightly higher for clients contacted longer ago. campaign & previous (-0.079): Very weak negative correlation; more prior contacts slightly associated with fewer contacts in this campaign. pdays & previous (-0.588): Moderate to strong negative correlation; as days since last contact (pdays) increases, the number of prior contacts decreases. Makes sense: if someone was contacted long ago, there were fewer previous contacts.

Key takeaways:

Most variables have very weak correlations (close to 0), meaning they are largely independent.

The only strong relationship is between pdays and previous (-0.588), which is meaningful for modeling.

Variables like age, duration, and campaign are not strongly correlated with each other, so multicollinearity is unlikely among them.

# Relationships between different variables

# Scatterplot matrix for numeric variables
pairs(numeric_vars, main = "Scatterplot Matrix of Numeric Variables", pch = 19, col = "blue")

# Or using ggplot for two selected variables
ggplot(bank_additional_df, aes(x = pdays, y = previous)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", col = "red") +
  theme_minimal() +
  ggtitle("Relationship: pdays vs previous")
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plots for numeric variables (interactive)

# Example: campaign vs duration
plot_ly(bank_additional_df, 
        x = ~campaign, 
        y = ~duration,
        type = 'scatter',
        mode = 'markers',
        color = ~job,        # optional: color by a categorical variable
        text = ~paste("Age:", age, "<br>Previous:", previous),
        marker = list(size = 10, opacity = 0.7)) %>%
  layout(title = "Relationship: Campaign vs Duration",
         xaxis = list(title = "Number of Contacts in Campaign"),
         yaxis = list(title = "Call Duration (seconds)"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
cor_matrix <- round(cor(numeric_vars, use = "complete.obs"), 2)
plot_cor <- plot_ly(x = colnames(cor_matrix), y = rownames(cor_matrix), z = cor_matrix, 
                    type = "heatmap", colorscale = "Viridis") %>%
  layout(title = "Correlation Heatmap of Numeric Variables")
plot_cor
missing_summary <- bank_additional_df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Missing_Count")

plot_missing <- plot_ly(missing_summary, x = ~Variable, y = ~Missing_Count, type = "bar", color = ~Variable) %>%
  layout(title = "Missing Values per Column",
         xaxis = list(title = "Variable"),
         yaxis = list(title = "Missing Count"))
plot_missing
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_box <- plot_ly(bank_additional_df, 
                    x = ~job, 
                    y = ~duration, 
                    type = 'box', 
                    color = ~job,
                    boxpoints = 'all', 
                    jitter = 0.3, 
                    pointpos = -1.8) %>%
  layout(title = "Call Duration by Job",
         xaxis = list(title = "Job"),
         yaxis = list(title = "Duration"))
plot_box
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
numeric_vars <- bank_additional_df %>% select(where(is.numeric))
numeric_long <- numeric_vars %>% pivot_longer(everything(), names_to = "Variable", values_to = "Value")

plot_numeric_hist <- plot_ly(numeric_long, 
                             x = ~Value, 
                             type = 'histogram', 
                             color = ~Variable, 
                             nbinsx = 30) %>%
  layout(barmode = "overlay",
         title = "Distribution of Numeric Variables",
         xaxis = list(title = "Value"),
         yaxis = list(title = "Count"))
plot_numeric_hist
# Relationships between numeric variables (scatter matrix)

plot_splom <- plot_ly(type = 'splom', 
                      dimensions = list(
                        list(label = 'Age', values = numeric_vars$age),
                        list(label = 'Duration', values = numeric_vars$duration),
                        list(label = 'Campaign', values = numeric_vars$campaign),
                        list(label = 'Pdays', values = numeric_vars$pdays),
                        list(label = 'Previous', values = numeric_vars$previous)
                      ),
                      marker = list(color = 'rgba(0, 100, 200, 0.5)', size = 5)) %>%
  layout(title = 'Scatterplot Matrix of Numeric Variables')
plot_splom